#' @rdname DESeqDataSet
#' @export
setClass("DESeqDataSet",
         contains = "RangedSummarizedExperiment",
         representation = representation( 
           design = "ANY",
           dispersionFunction = "function"))

setValidity("DESeqDataSet", function(object) {
  if (! ("counts" %in% assayNames(object)) )
    return( "the assays slot must contain a matrix named 'counts'" )
  if ( !is.numeric( counts(object) ) )
    return( "the count data is not numeric" )
  if ( any( is.na( counts(object) ) ) )
    return( "NA values are not allowed in the count matrix" )
  if ( !is.integer( counts(object) ) )
    return( "the count data is not in integer mode" )
  if ( any( counts(object) < 0 ) )
    return( "the count data contains negative values" )

  design <- design(object)
  # 'design' is either a formula or matrix
  stopifnot(is(design, "formula") | is(design, "matrix"))

  if (is(design, "formula")) {
    designVars <- all.vars(design)
    if (!all(designVars %in% names(colData(object)))) {
      return("all variables in design formula must be columns in colData")
    }
    designVarsClass <- sapply(designVars, function(v) class(colData(object)[[v]]))
    if (any(designVarsClass == "character")) {
      return("variables in design formula are character vectors.
  convert these columns of colData(object) to factors before including in the design formula")
    }
    designFactors <- designVars[designVarsClass == "factor"]
    # levels would duplicate after make.names()
    if (any(sapply(designFactors,function(v) {
      factor.lvls <- levels(colData(object)[[v]])
      factor.nms <- make.names(factor.lvls)
      any(duplicated(factor.nms))
    }))) {
      return("levels of factors in the design have non-unique level names after make.names() is applied.
  best to only use letters and numbers for levels of factors in the design")
    }
    # levels contain characters other than letters, numbers, and underscore
    if (any(sapply(designFactors,function(v) {
      factor.lvls <- levels(colData(object)[[v]])
      any(!grepl("^[A-Za-z0-9_.]+$",factor.lvls))
    }))) {
      # just a warning for now
      message("  Note: levels of factors in the design contain characters other than
  letters, numbers, '_' and '.'. It is recommended (but not required) to use
  only letters, numbers, and delimiters '_' or '.', as these are safe characters
  for column names in R. [This is a message, not a warning or an error]")
    }
  } else if (is(design, "matrix")) {
    # TODO add some more tests for if 'design' is matrix
    stopifnot(nrow(design) == ncol(object))
  }
  
  TRUE
})
  
#' DESeqDataSet object and constructors
#'
#' \code{DESeqDataSet} is a subclass of \code{RangedSummarizedExperiment},
#' used to store the input values, intermediate calculations and results of an
#' analysis of differential expression.  The \code{DESeqDataSet} class
#' enforces non-negative integer values in the "counts" matrix stored as
#' the first element in the assay list.
#' In addition, a formula which specifies the design of the experiment must be provided.
#' The constructor functions create a DESeqDataSet object
#' from various types of input:
#' a RangedSummarizedExperiment, a matrix, count files generated by
#' the python package HTSeq, or a list from the tximport function in the
#' tximport package.
#' See the vignette for examples of construction from different types.
#'
#' Note on the error message "assay colnames() must be NULL or equal colData rownames()":
#' this means that the colnames of countData are different than the rownames of colData.
#' Fix this with: \code{colnames(countData) <- NULL}
#'
#' @param se a \code{RangedSummarizedExperiment} with columns of variables
#' indicating sample information in \code{colData},
#' and the counts as the first element in the assays list, which will
#' be renamed "counts". A \code{RangedSummarizedExperiment} object can be
#' generated by the function \code{summarizeOverlaps} in the GenomicAlignments
#' package.
#' @param design a \code{formula} or \code{matrix}.
#' the \code{formula} expresses how the counts for each gene
#' depend on the variables in \code{colData}. Many R \code{formula} are valid,
#' including designs with multiple variables, e.g., \code{~ group + condition},
#' and designs with interactions, e.g., \code{~ genotype + treatment + genotype:treatment}.
#' See \code{\link{results}} for a variety of designs and how to extract results tables.
#' By default, the functions in this package will use 
#' the last variable in the formula for building results tables and plotting.
#' \code{~ 1} can be used for no design, although users need to remember
#' to switch to another design for differential testing.
#' @param countData for matrix input: a matrix of non-negative integers
#' @param colData for matrix input: a \code{DataFrame} or \code{data.frame} with at least a single column.
#' Rows of colData correspond to columns of countData
#' @param tidy for matrix input: whether the first column of countData is the rownames for the count matrix
#' @param sampleTable for htseq-count: a \code{data.frame} with three or more columns. Each row
#' describes one sample. The first column is the sample name, the second column
#' the file name of the count file generated by htseq-count, and the remaining
#' columns are sample metadata which will be stored in \code{colData}
#' @param txi for tximport: the simple list output of the \code{tximport} function
#' @param directory for htseq-count: the directory relative to which the filenames are specified. defaults to current directory
#' @param ignoreRank use of this argument is reserved for DEXSeq developers only.
#' Users will immediately encounter an error upon trying to estimate dispersion
#' using a design with a model matrix which is not full rank.
#' @param ... arguments provided to \code{SummarizedExperiment} including rowRanges and metadata. Note that
#' for Bioconductor 3.1, rowRanges must be a GRanges or GRangesList, with potential metadata columns
#' as a DataFrame accessed and stored with \code{mcols}. If a user wants to store metadata columns
#' about the rows of the countData, but does not have GRanges or GRangesList information,
#' first construct the DESeqDataSet without rowRanges and then add the DataFrame with \code{mcols(dds)}.
#' 
#' @return A DESeqDataSet object.
#' 
#' @aliases DESeqDataSet DESeqDataSet-class DESeqDataSetFromMatrix DESeqDataSetFromHTSeqCount
#'
#' @references See \url{http://www-huber.embl.de/users/anders/HTSeq} for htseq-count
#'
#' @docType class
#'
#' @examples
#'
#' countData <- matrix(1:100,ncol=4)
#' condition <- factor(c("A","A","B","B"))
#' dds <- DESeqDataSetFromMatrix(countData, DataFrame(condition), ~ condition)
#'
#' @rdname DESeqDataSet
#' @importFrom utils packageVersion
#' @export
DESeqDataSet <- function(se, design, ignoreRank=FALSE) {
  if (!is(se, "RangedSummarizedExperiment")) {
    if (is(se, "SummarizedExperiment")) {
      se <- as(se, "RangedSummarizedExperiment")
    } else {
      stop("'se' must be a RangedSummarizedExperiment object")
    }
  }
  if (is.null(assayNames(se)) || assayNames(se)[1] != "counts") {
    message("renaming the first element in assays to 'counts'")
    assayNames(se)[1] <- "counts"
  }
  # special tximeta processing
  if ("tximetaInfo" %in% names(metadata(se))) {
    se <- processTximeta(se)
  }  
  # before validity check, try to convert assay to integer mode
  if (any(is.na(assay(se))))
    stop("NA values are not allowed in the count matrix")
  if (any(assay(se) < 0)) {
    stop("some values in assay are negative")
  }
  if (!is.integer(assay(se))) {
    if (!is.numeric(assay(se))) {
      stop(paste("counts matrix should be numeric, currently it has mode:", mode(assay(se))))
    }
    if (any(round(assay(se)) != assay(se))) {
      stop("some values in assay are not integers")
    }
    message("converting counts to integer mode")
    mode(assay(se)) <- "integer"
  }

  if (all(assay(se) == 0)) {
    stop("all samples have 0 counts for all genes. check the counting script.")
  }
  
  if (all(rowSums(assay(se) == assay(se)[,1]) == ncol(se))) {
    warning("all genes have equal values for all samples. will not be able to perform differential analysis")
  }

  if (any(duplicated(rownames(se)))) {
    warning(sum(duplicated(rownames(se)))," duplicate rownames were renamed by adding numbers")
    rnms <- rownames(se)
    dups <- unique(rnms[duplicated(rnms)])
    for (rn in dups) {
      idx <- which(rnms == rn)
      rnms[idx[-1]] <- paste(rnms[idx[-1]], c(seq_len(length(idx) - 1)), sep=".")
    }
    rownames(se) <- rnms
  }

  if (is(design, "formula")) {
    designVars <- all.vars(design)
    if (!all(designVars %in% names(colData(se)))) {
      stop("all variables in design formula must be columns in colData")
    }
    for (v in designVars) {
      if (any(is.na(colData(se)[[v]])))
        stop(paste("variables in design formula cannot contain NA:", v))
    }
    
    designVarsClass <- sapply(designVars, function(v) class(colData(se)[[v]])[1])

    if (any(designVarsClass == "character")) {
      warning("some variables in design formula are characters, converting to factors")
      for (v in designVars[designVarsClass == "character"]) {
        colData(se)[[v]] <- factor(colData(se)[[v]])
      }
    }
    
    if (length(designVars) == 1) {
      var <- colData(se)[[designVars]]
      if (all(var == var[1])) {
        stop("design has a single variable, with all samples having the same value.
  use instead a design of '~ 1'. estimateSizeFactors, rlog and the VST can then be used")
      }
    }

    designVarsNumeric <- sapply(designVars, function(v) is.numeric(colData(se)[[v]]))
    if (any(designVarsNumeric)) {
      msgIntVars <- FALSE
      msgCenterScale <- FALSE
      for (v in designVars[designVarsNumeric]) {
        if (all(colData(se)[[v]] == round(colData(se)[[v]]))) {
          msgIntVars <- TRUE
        }
        if (mean(colData(se)[[v]]) > 5 | sd(colData(se)[[v]]) > 5) {
          msgCenterScale <- TRUE
        }
      }
      if (msgIntVars) {
        message("  the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function")
      }
      if (msgCenterScale) {
        message("  the design formula contains one or more numeric variables that have mean or
  standard deviation larger than 5 (an arbitrary threshold to trigger this message).
  it is generally a good idea to center and scale numeric variables in the design
  to improve GLM convergence.")
      }
    }

    if (any(designVarsClass == "ordered")) {
      stop("the design formula contains an ordered factor. The internal steps
do not work on ordered factors as a formula. Instead you should provide a matrix to
the 'design' slot or to the 'full' argument of DESeq(), constructed using model.matrix.")
    }
    
    designFactors <- designVars[designVarsClass == "factor"]
    missingLevels <- sapply(designFactors, function(v) any(table(colData(se)[[v]]) == 0))
    if (any(missingLevels)) {
      message("factor levels were dropped which had no samples")
      for (v in designFactors[missingLevels]) {
        colData(se)[[v]] <- droplevels(colData(se)[[v]])
      }
    }
    
    singleLevel <- sapply(designFactors, function(v) all(colData(se)[[v]] == colData(se)[[v]][1]))
    if (any(singleLevel)) {
      stop("design contains one or more variables with all samples having the same value,
  remove these variables from the design")
    }
    
    # if the last variable in the design formula is a
    # factor, and has a level 'control', check if it is
    # the reference level and if not print a message
    lastDV <- length(designVars)
    if (length(designVars) > 0 && designVarsClass[lastDV] == "factor") {
      lastDVLvls <- levels(colData(se)[[designVars[lastDV]]])
      controlSynonyms <- c("control","Control","CONTROL")
      for (cSyn in controlSynonyms) {
        if (cSyn %in% lastDVLvls) {
          if (cSyn != lastDVLvls[1]) {
            message(paste0("  it appears that the last variable in the design formula, '",designVars[lastDV],"',
  has a factor level, '",cSyn,"', which is not the reference level. we recommend
  to use factor(...,levels=...) or relevel() to set this as the reference level
  before proceeding. for more information, please see the 'Note on factor levels'
  in vignette('DESeq2')."))
          }
        }
      }
    }

    modelMatrix <- stats::model.matrix.default(design, data=as.data.frame(colData(se)))
  } else if (is(design, "matrix")) {
    modelMatrix <- design
  } else {
    stop("'design' should be a formula or a matrix")
  }

  if (!ignoreRank) {
    checkFullRank(modelMatrix)
  }
  
  # Add columns on the columns
  mcolsCols <- DataFrame(type=rep("input",ncol(colData(se))),
                         description=rep("",ncol(colData(se))))
  mcols(colData(se)) <- if (is.null(mcols(colData(se)))) {
    mcolsCols
  } else if (all(names(mcols(colData(se))) == c("type","description"))) {
    mcolsCols
  } else {
    cbind(mcols(colData(se)), mcolsCols)
  }
  
  object <- new("DESeqDataSet", se, design = design)
                                 
  # now we know we have at least an empty GRanges or GRangesList for rowRanges
  # so we can create a metadata column 'type' for the mcols
  # and we label any incoming columns as 'input'

  # this is metadata columns on the rows
  mcolsRows <- DataFrame(type=rep("input",ncol(mcols(object))),
                         description=rep("",ncol(mcols(object))))
  mcols(mcols(object)) <- if (is.null(mcols(mcols(object)))) {
    mcolsRows
  } else if (all(names(mcols(mcols(object))) == c("type","description"))) {
    mcolsRows
  } else {
    cbind(mcols(mcols(object)), mcolsRows)
  }

  # stash the package version
  metadata(object)[["version"]] <- packageVersion("DESeq2")
  
  return(object)
}

#' @rdname DESeqDataSet
#' @export
DESeqDataSetFromMatrix <- function( countData, colData, design, tidy=FALSE, ignoreRank=FALSE, ... )
{
  
  if (tidy) {
    stopifnot(ncol(countData) > 1)
    rownms <- as.character(countData[,1])
    countData <- countData[,-1,drop=FALSE]
    rownames(countData) <- rownms
  }

  # check that these agree in number
  stopifnot(ncol(countData) == nrow(colData))

  if (is(countData, "data.frame")) {
    if (any(sapply(countData, is, "factor"))) {
      warning("\n\n  'countData' is a data.frame with one or more factor columns.
  Be aware that converting directly to numeric can lead to errors.
  Provide matrices of integers to avoid this error.")
    }
  }
  
  # we expect a matrix of counts, which are non-negative integers
  countData <- as.matrix( countData )

  if (is(colData,"data.frame"))
    colData <- as(colData, "DataFrame")
  
  # check if the rownames of colData are simply in different order
  # than the colnames of the countData, if so throw an error
  # as the user probably should investigate what's wrong
  if (!is.null(rownames(colData)) & !is.null(colnames(countData))) {
    if (all(sort(rownames(colData)) == sort(colnames(countData)))) {
      if (!all(rownames(colData) == colnames(countData))) {
        stop(paste("rownames of the colData:
  ",paste(rownames(colData),collapse=","),"
  are not in the same order as the colnames of the countData:
  ",paste(colnames(countData),collapse=",")))
      }
    }
  }
  if (is.null(rownames(colData)) & !is.null(colnames(countData))) {
    rownames(colData) <- colnames(countData)
  }
  
  se <- SummarizedExperiment(assays = SimpleList(counts=countData), colData = colData, ...)
  object <- DESeqDataSet(se, design = design, ignoreRank)

  return(object)
}

#' @rdname DESeqDataSet
#' @export
DESeqDataSetFromHTSeqCount <- function( sampleTable, directory=".", design, ignoreRank=FALSE, ...) 
{
  if (missing(design)) stop("design is missing")
  l <- lapply( as.character( sampleTable[,2] ), function(fn) read.table( file.path( directory, fn ), fill=TRUE ) )
  if( ! all( sapply( l, function(a) all( a$V1 == l[[1]]$V1 ) ) ) )
    stop( "Gene IDs (first column) differ between files." )
  # select last column of 'a', works even if htseq was run with '--additional-attr'
  tbl <- sapply( l, function(a) a[,ncol(a)] )
  colnames(tbl) <- sampleTable[,1]
  rownames(tbl) <- l[[1]]$V1
  rownames(sampleTable) <- sampleTable[,1]
  oldSpecialNames <- c("no_feature","ambiguous","too_low_aQual","not_aligned","alignment_not_unique")
  # either starts with two underscores
  # or is one of the old special names (htseq-count backward compatability)
  specialRows <- (substr(rownames(tbl),1,1) == "_") | rownames(tbl) %in% oldSpecialNames
  tbl <- tbl[ !specialRows, , drop=FALSE ]
  object <- DESeqDataSetFromMatrix(countData=tbl,colData=sampleTable[,-(1:2),drop=FALSE],design=design,ignoreRank, ...)
  return(object)
}   

#' @rdname DESeqDataSet
#' @export
DESeqDataSetFromTximport <- function(txi, colData, design, ...) 
{
  stopifnot(is(txi, "list"))
  counts <- round(txi$counts)
  mode(counts) <- "integer"
  object <- DESeqDataSetFromMatrix(countData=counts, colData=colData, design=design, ...)
  stopifnot(txi$countsFromAbundance %in% c("no","scaledTPM","lengthScaledTPM"))
  if (txi$countsFromAbundance %in% c("scaledTPM","lengthScaledTPM")) {
    message("using just counts from tximport")
  } else {
    message("using counts and average transcript lengths from tximport")
    lengths <- txi$length
    stopifnot(all(lengths > 0))
    dimnames(lengths) <- dimnames(object)
    assays(object)[["avgTxLength"]] <- lengths
  }
  return(object)
}   


#' @rdname DESeqResults
#' @export
setClass("DESeqResults",
         contains="DFrame",
         representation = representation( 
           priorInfo = "list")
         )

#' DESeqResults object and constructor
#'
#' This constructor function would not typically be used by "end users".
#' This simple class indirectly extends the DataFrame class defined in the
#' S4Vectors package to allow other packages to write methods for results
#' objects from the DESeq2 package. It is used by \code{\link{results}}
#' to wrap up the results table.
#'
#' @param DataFrame a DataFrame of results, standard column names are:
#' baseMean, log2FoldChange, lfcSE, stat, pvalue, padj.
#' @param priorInfo a list giving information on the log fold change prior
#'
#' @return a DESeqResults object
#' @docType class
#' @aliases DESeqResults-class
#' @rdname DESeqResults
#' @export
DESeqResults <- function(DataFrame, priorInfo=list()) {
  new("DESeqResults", DataFrame, priorInfo=priorInfo)
}

#' @rdname DESeqTransform
#' @export
setClass("DESeqTransform", contains="RangedSummarizedExperiment")

#' DESeqTransform object and constructor
#'
#' This constructor function would not typically be used by "end users".
#' This simple class extends the RangedSummarizedExperiment class of the
#' SummarizedExperiment package.
#' It is used by \code{\link{rlog}} and
#' \code{\link{varianceStabilizingTransformation}}
#' to wrap up the results into a class for downstream methods,
#' such as \code{\link{plotPCA}}.
#' 
#' @param SummarizedExperiment a RangedSummarizedExperiment
#'
#' @return a DESeqTransform object
#' @docType class
#' @aliases DESeqTransform-class
#' @rdname DESeqTransform
#' @export
DESeqTransform <- function(SummarizedExperiment) {
  se <- SummarizedExperiment
  if (!is(se, "RangedSummarizedExperiment")) {
    if (is(se, "SummarizedExperiment")) {
      se <- as(se, "RangedSummarizedExperiment")
    } else {
      stop("'SummarizedExperiment' must be a RangedSummarizedExperiment object")
    }
  }
  new("DESeqTransform", se)
}

# unexported helper function
processTximeta <- function(se) {
  assay(se) <- round(assay(se))
  mode(assay(se)) <- "integer"
  stopifnot("countsFromAbundance" %in% names(metadata(se)))
  cfa <- metadata(se)$countsFromAbundance
  stopifnot(cfa %in% c("no","scaledTPM","lengthScaledTPM"))  
  if (cfa %in% c("scaledTPM","lengthScaledTPM")) {
    message("using just counts from tximeta")
  } else {
    message("using counts and average transcript lengths from tximeta")
    stopifnot(all(assays(se)$length > 0))
    anms <- assayNames(se)
    assayNames(se)[anms == "length"] <- "avgTxLength"
  }
  return(se)
}
